import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import hvplot.pandas
from scipy import stats
import sklearn
# Render matplotlib figures inline in the notebook (Jupyter magic).
%matplotlib inline
# Global plot styling: seaborn grid background + fivethirtyeight theme.
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Load the advertising dataset (TV/Radio/Newspaper spend vs. Sales) and
# preview the first five rows.
DATA_PATH = "C:\\Users\\MAHFOOZ ALAM\\Desktop\\DATASET FOR USAGE\\Sales Prediction.csv"
sales = pd.read_csv(DATA_PATH)
sales.head()
| TV | Radio | Newspaper | Sales | |
|---|---|---|---|---|
| 0 | 230.1 | 37.8 | 69.2 | 22.1 |
| 1 | 44.5 | 39.3 | 45.1 | 10.4 |
| 2 | 17.2 | 45.9 | 69.3 | 12.0 |
| 3 | 151.5 | 41.3 | 58.5 | 16.5 |
| 4 | 180.8 | 10.8 | 58.4 | 17.9 |
# Column labels of the dataset.
sales.columns
Index(['TV', 'Radio', 'Newspaper', 'Sales'], dtype='object')
# (rows, columns) of the dataset.
sales.shape
(200, 4)
# Dtypes, non-null counts and memory usage per column.
sales.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 200 entries, 0 to 199 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 TV 200 non-null float64 1 Radio 200 non-null float64 2 Newspaper 200 non-null float64 3 Sales 200 non-null float64 dtypes: float64(4) memory usage: 6.4 KB
# Summary statistics (count, mean, std, min/quartiles/max) per numeric column.
sales.describe()
| TV | Radio | Newspaper | Sales | |
|---|---|---|---|---|
| count | 200.000000 | 200.000000 | 200.000000 | 200.000000 |
| mean | 147.042500 | 23.264000 | 30.554000 | 15.130500 |
| std | 85.854236 | 14.846809 | 21.778621 | 5.283892 |
| min | 0.700000 | 0.000000 | 0.300000 | 1.600000 |
| 25% | 74.375000 | 9.975000 | 12.750000 | 11.000000 |
| 50% | 149.750000 | 22.900000 | 25.750000 | 16.000000 |
| 75% | 218.825000 | 36.525000 | 45.100000 | 19.050000 |
| max | 296.400000 | 49.600000 | 114.000000 | 27.000000 |
# Frequency of each distinct Sales value (Sales is continuous, so most
# values occur only once — 121 distinct values over 200 rows).
sales.Sales.value_counts()
11.9 5
16.7 5
20.7 4
11.0 3
11.3 3
..
13.4 1
24.2 1
8.1 1
5.5 1
25.5 1
Name: Sales, Length: 121, dtype: int64
# Bar chart of how often each Sales value occurs.
# FIX: pass the variable via the x= keyword — positional data arguments are
# deprecated in recent seaborn releases.
# NOTE(review): Sales is continuous (121 distinct values), so a countplot is
# hard to read here; sns.histplot(sales['Sales']) would usually be clearer.
sns.countplot(x='Sales', data=sales)
<Axes: ylabel='count'>
# Count missing values per column (all zero in this dataset).
sales.isna().sum()
TV 0 Radio 0 Newspaper 0 Sales 0 dtype: int64
# Select only the object-dtype (categorical) columns.  The dataset has none,
# so the result is a 200-row frame with zero columns.
data_categorical = sales.select_dtypes(include=[object])
data_categorical
| 0 |
|---|
| 1 |
| 2 |
| 3 |
| 4 |
| ... |
| 195 |
| 196 |
| 197 |
| 198 |
| 199 |
200 rows × 0 columns
# Keep every column that is not object-dtype — here that is all four
# float64 columns.
data_numerical = sales.loc[:, sales.dtypes != object]
data_numerical
| TV | Radio | Newspaper | Sales | |
|---|---|---|---|---|
| 0 | 230.1 | 37.8 | 69.2 | 22.1 |
| 1 | 44.5 | 39.3 | 45.1 | 10.4 |
| 2 | 17.2 | 45.9 | 69.3 | 12.0 |
| 3 | 151.5 | 41.3 | 58.5 | 16.5 |
| 4 | 180.8 | 10.8 | 58.4 | 17.9 |
| ... | ... | ... | ... | ... |
| 195 | 38.2 | 3.7 | 13.8 | 7.6 |
| 196 | 94.2 | 4.9 | 8.1 | 14.0 |
| 197 | 177.0 | 9.3 | 6.4 | 14.8 |
| 198 | 283.6 | 42.0 | 66.2 | 25.5 |
| 199 | 232.1 | 8.6 | 8.7 | 18.4 |
200 rows × 4 columns
# True when at least one fully duplicated row exists in the dataset.
df_dup = sales.duplicated().sum() > 0
df_dup
False
# Boxplots to inspect spread and outliers of each advertising channel.
# FIX: the original assigned each plot to unused variables (plt1/plt2/plt3)
# and passed the data positionally, which is deprecated in recent seaborn —
# loop over the columns and use the x= keyword instead.
fig, axs = plt.subplots(3, figsize=(5, 5))
for ax, column in zip(axs, ['TV', 'Newspaper', 'Radio']):
    sns.boxplot(x=sales[column], ax=ax)
plt.tight_layout()
# Boxplot of the target variable, drawn as its own (implicit) figure.
sns.boxplot(x=sales['Sales'])
plt.show()
# Scatter plots of Sales against each advertising channel, side by side.
sns.pairplot(sales, x_vars=['TV', 'Newspaper', 'Radio'], y_vars='Sales', height=4, aspect=1, kind='scatter')
<seaborn.axisgrid.PairGrid at 0x16936f87370>
# Annotated correlation matrix of all numeric columns.
plt.figure(figsize=(10,7))
sns.heatmap(sales.corr(), annot = True)
<Axes: >
# Simple linear regression setup: predict Sales from TV spend only.
X = sales['TV']
y = sales['Sales']
import statsmodels.api as sm
# 70/30 train-test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit OLS: Sales ~ const + TV.
# FIX: removed the dead `lr = LinearRegression()` — it was created and then
# immediately overwritten by the statsmodels fit below, so it was never used.
X_train_sm = sm.add_constant(X_train)  # prepend the intercept column
lr = sm.OLS(y_train, X_train_sm).fit()
# Predict on the test set (the intercept column must be added here too).
X_test_sm = sm.add_constant(X_test)
y_pred = lr.predict(X_test_sm)
# Fitted intercept (const) and slope (TV).
lr.params
const 7.206555 TV 0.054835 dtype: float64
# Full OLS report: R², coefficient estimates with CIs, and residual diagnostics.
print(lr.summary())
OLS Regression Results
==============================================================================
Dep. Variable: Sales R-squared: 0.800
Model: OLS Adj. R-squared: 0.798
Method: Least Squares F-statistic: 550.7
Date: Sat, 26 Aug 2023 Prob (F-statistic): 5.08e-50
Time: 18:37:15 Log-Likelihood: -314.94
No. Observations: 140 AIC: 633.9
Df Residuals: 138 BIC: 639.8
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 7.2066 0.414 17.392 0.000 6.387 8.026
TV 0.0548 0.002 23.467 0.000 0.050 0.059
==============================================================================
Omnibus: 1.138 Durbin-Watson: 2.351
Prob(Omnibus): 0.566 Jarque-Bera (JB): 1.240
Skew: -0.190 Prob(JB): 0.538
Kurtosis: 2.739 Cond. No. 376.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Test-set scatter with the fitted regression line overlaid.
# BUG FIX: the slope was hard-coded as 0.0543, but the fitted slope is
# ~0.054835 (see lr.params above) — use the fitted parameters directly
# instead of re-typing them, so the line always matches the model.
plt.scatter(X_test, y_test)
plt.plot(X_test, lr.params['const'] + lr.params['TV'] * X_test, 'r')
plt.show()
# Evaluate test-set error with the standard regression metrics.
import sklearn.metrics as metrics
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('MAE: {}'.format(mae))
print('MSE: {}'.format(mse))
print('RMSE: {}'.format(np.sqrt(mse)))
MAE: 1.806912377664152 MSE: 5.1795254021666555 RMSE: 2.2758570698017606
# Distribution of the test-set residuals (roughly bell-shaped is a good sign).
# FIX: sns.distplot is deprecated (see the UserWarning emitted below) —
# histplot with a KDE overlay on the density scale is the supported equivalent.
sns.histplot((y_test-y_pred), kde=True, stat='density')
C:\Users\MAHFOOZ ALAM\AppData\Local\Temp\ipykernel_12304\2077165099.py:1: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot((y_test-y_pred))
<Axes: ylabel='Density'>
# Residuals vs. the predictor — no visible pattern suggests the linear fit
# is adequate.
res = (y_test - y_pred)
plt.scatter(X_test,res)
plt.show()
# NOTE(review): this recomputes X_test_sm and y_pred exactly as done above —
# redundant but harmless.
X_test_sm = sm.add_constant(X_test)
y_pred = lr.predict(X_test_sm)
# First five test-set predictions.
y_pred.head()
95 16.161091 15 17.921291 30 23.267692 158 7.848123 128 19.286679 dtype: float64
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# RMSE on the test set (matches the value printed earlier).
np.sqrt(mean_squared_error(y_test, y_pred))
2.2758570698017606
# Test-set R²: about 0.81 of the variance in Sales is explained by TV spend.
r_squared = r2_score(y_test, y_pred)
r_squared
0.814855389208679
# Final plot of the fitted line over the test data.
# BUG FIX: the intercept and slope were hard-coded as 6.948 and 0.054, which
# do not match the fitted model (const ≈ 7.2066, TV ≈ 0.054835 per lr.params
# and the OLS summary above).  Plot from the fitted parameters so the line
# reflects the actual model.
plt.scatter(X_test, y_test)
plt.plot(X_test, lr.params['const'] + lr.params['TV'] * X_test, 'r')
plt.show()